packages
pacman::p_load(dplyr, ggplot2, googlesheets, openxlsx, stringr, rvest, dplyr, ggplot2, h2o, caret, text2vec)
Installing package into ‘C:/Users/Workplace/Documents/R/win-library/3.4’
(as ‘lib’ is unspecified)
also installing the dependencies ‘lambda.r’, ‘futile.options’, ‘RcppParallel’, ‘data.table’, ‘irlba’, ‘futile.logger’, ‘mlapi’, ‘sparsepp’
trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/lambda.r_1.2.zip'
Content type 'application/zip' length 93201 bytes (91 KB)
downloaded 91 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/futile.options_1.0.0.zip'
Content type 'application/zip' length 17090 bytes (16 KB)
downloaded 16 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/RcppParallel_4.3.20.zip'
Content type 'application/zip' length 3378535 bytes (3.2 MB)
downloaded 3.2 MB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/data.table_1.10.4-3.zip'
Content type 'application/zip' length 1577200 bytes (1.5 MB)
downloaded 1.5 MB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/irlba_2.3.2.zip'
Content type 'application/zip' length 279956 bytes (273 KB)
downloaded 273 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/futile.logger_1.4.3.zip'
Content type 'application/zip' length 95441 bytes (93 KB)
downloaded 93 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/mlapi_0.1.0.zip'
Content type 'application/zip' length 72925 bytes (71 KB)
downloaded 71 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/sparsepp_0.2.0.zip'
Content type 'application/zip' length 144909 bytes (141 KB)
downloaded 141 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/text2vec_0.5.1.zip'
Content type 'application/zip' length 5140413 bytes (4.9 MB)
downloaded 4.9 MB
package ‘lambda.r’ successfully unpacked and MD5 sums checked
package ‘futile.options’ successfully unpacked and MD5 sums checked
package ‘RcppParallel’ successfully unpacked and MD5 sums checked
package ‘data.table’ successfully unpacked and MD5 sums checked
package ‘irlba’ successfully unpacked and MD5 sums checked
package ‘futile.logger’ successfully unpacked and MD5 sums checked
package ‘mlapi’ successfully unpacked and MD5 sums checked
package ‘sparsepp’ successfully unpacked and MD5 sums checked
package ‘text2vec’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\Workplace\AppData\Local\Temp\RtmpGsflLY\downloaded_packages
text2vec installed
package ‘text2vec’ was built under R version 3.4.3
load data
# Read the saved R data file and bind the object stored in it to `df`.
# load() restores the saved object(s) into the current environment and returns
# their names as a character vector; get() then fetches the object by name.
# NOTE(review): presumably the file holds exactly one object -- confirm.
df <- get(load("sub_sample2.Rdata"))
clean
clean_social_media <- function(x){
  # Normalise raw social-media text into lower-cased text with placeholder
  # tokens for URLs, handles, hashtags, emoticons and punctuation.
  #
  # x: character vector of raw posts.
  # Returns a character vector of the same length as `x`.
  #
  # The rules are applied strictly in the order listed; each one sees the
  # output of the previous one.
  # NOTE(review): because the text is lower-cased and every parenthesis is
  # rewritten before the emoticon rules run, the patterns that contain "(",
  # ")" or upper-case letters look unreachable, as does the "..." rule that
  # follows the single-dot rule. Order is kept as-is here -- confirm before
  # changing it.
  substitutions <- list(
    # Twitter-specific tokens
    c("https?[:]//[[:graph:]]+", "URL"),
    c("@(\\w+)", " HNDL"),
    c("#(\\w+)", " HASH"),
    # Alt-right notation: a run of opening parentheses (plus one following
    # character, if any) becomes a marker; closing runs become a space
    c("\\(+.?", "JEW "),
    c("\\)+", " "),
    # Emoticons
    c(":-\\)|:\\)|\\(:|\\(-:", " EMO_SMILEY "),
    c(":-D|:D|X-D|XD|xD", " EMO_LAUGH "),
    c("<3|:\\*", "EMO_LOVE"),
    c(";-\\)|;\\)|;-D|;D|\\(;|\\(-;", "EMO_WINK"),
    c(":-\\(|:\\(|\\):|\\)-:", "EMO_FROWN"),
    c(':,\\(|:"\\(|:\\(\\(', "EMO_CRY"),
    # General punctuation
    c("\\.|\\:|\\;", " PUNC_DOT "),
    c("\\!", " PUNC_EXCL "),
    c("\\?", " PUNC_QUES "),
    c("\\.\\.\\.", " PUNC_DOTS ")
  )

  # Line breaks become spaces first, then everything is lower-cased.
  cleaned <- str_to_lower(str_replace_all(x, "\n", " "))

  for (rule in substitutions) {
    cleaned <- str_replace_all(cleaned, rule[[1]], rule[[2]])
  }

  # Strip leading/trailing whitespace left behind by the padded tokens.
  str_trim(cleaned)
}
# Add `ctext`: the `text` column passed through clean_social_media().
df <- mutate(df, ctext = clean_social_media(text))
package ‘bindrcpp’ was built under R version 3.4.3
vectorize
# Restore the objects saved in vectorizer.Rdata into the current environment.
# NOTE(review): presumably this is what defines `vectorizer`, which is used
# below but not assigned anywhere in the visible code -- confirm.
load("vectorizer.Rdata")

### test
# Token iterator over the cleaned text, with one document id per row of `df`.
pred_it <- itoken(
  df$ctext,
  ids = df$id,
  # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned.
  progressbar = FALSE
)

# Document-term matrix for the new texts, built with the loaded vectorizer.
pred_dtm <- create_dtm(pred_it, vectorizer)
predict
# Split `df_pred` into a list of pieces, grouped by its `sp` column.
split(df_pred, df_pred[, "sp"])
$`1`
$`2`
$`3`
$`4`
$`5`
$`6`
$`7`
$`8`
$`9`
$`10`
$`11`
$`12`
$`13`
$`14`
$`15`
$`16`
$`17`
$`18`
$`19`
$`20`
NA
LS0tDQp0aXRsZTogIlByZWRpY3Qgb24gdW5zZWVuIGRhdGEiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQoNCiMjIHBhY2thZ2VzDQoNCmBgYHtyfQ0KcGFjbWFuOjpwX2xvYWQoZHBseXIsIGdncGxvdDIsIGdvb2dsZXNoZWV0cywgb3Blbnhsc3gsIHN0cmluZ3IsIHJ2ZXN0LCBkcGx5ciwgZ2dwbG90MiwgaDJvLCBjYXJldCwgdGV4dDJ2ZWMpDQpgYGANCg0KDQojIyBsb2FkIGRhdGENCg0KYGBge3J9DQpkZiA8LSBnZXQobG9hZCgic3ViX3NhbXBsZTIuUmRhdGEiKSkNCmBgYA0KDQoNCiMjIGNsZWFuIA0KDQpgYGB7cn0NCmNsZWFuX3NvY2lhbF9tZWRpYSA8LSBmdW5jdGlvbih4KXsNCg0KICB4ICU+JQ0KICAgIHN0cl9yZXBsYWNlX2FsbCgiXG4iLCAiICIpICU+JQ0KICAgIHN0cl90b19sb3dlcigpICU+JQ0KICAgICMjIyBUd2l0dGVyIHNwZWNpZmljDQogICAgc3RyX3JlcGxhY2VfYWxsKCJodHRwcz9bOl0vL1tbOmdyYXBoOl1dKyIsICJVUkwiKSAlPiUNCiAgICBzdHJfcmVwbGFjZV9hbGwoIkAoXFx3KykiLCAiIEhOREwiKSAlPiUNCiAgICBzdHJfcmVwbGFjZV9hbGwoIiMoXFx3KykiLCAiIEhBU0giKSAlPiUNCiAgICAjIyMgQUx0cmlnaHQgcmVwbGFjZW1lbnRzDQogICAgc3RyX3JlcGxhY2VfYWxsKCJcXCgrLj8iLCAiSkVXICIpICU+JQ0KICAgIHN0cl9yZXBsYWNlX2FsbCgiXFwpKyIsICIgIikgJT4lDQogICAgIyMjIHNtaWxpZXMNCiAgICBzdHJfcmVwbGFjZV9hbGwoIjotXFwpfDpcXCl8XFwoOnxcXCgtOiIsICIgRU1PX1NNSUxFWSAiKSAlPiUNCiAgICBzdHJfcmVwbGFjZV9hbGwoIjotRHw6RHxYLUR8WER8eEQiLCAiIEVNT19MQVVHSCAiKSAlPiUNCiAgICBzdHJfcmVwbGFjZV9hbGwoIjwzfDpcXCoiLCAiRU1PX0xPVkUiKSAlPiUNCiAgICBzdHJfcmVwbGFjZV9hbGwoIjstXFwpfDtcXCl8Oy1EfDtEfFxcKDt8XFwoLTsiLCAiRU1PX1dJTksiKSAlPiUNCiAgICBzdHJfcmVwbGFjZV9hbGwoIjotXFwofDpcXCh8XFwpOnxcXCktOiIsICJFTU9fRlJPV04iKSAlPiUNCiAgICBzdHJfcmVwbGFjZV9hbGwoJzosXFwofDoiXFwofDpcXChcXCgnLCAiRU1PX0NSWSIpICU+JQ0KICAgICMjIyBHZW5lcmFsDQogICAgc3RyX3JlcGxhY2VfYWxsKCJcXC58XFw6fFxcOyIsICIgUFVOQ19ET1QgIikgJT4lDQogICAgc3RyX3JlcGxhY2VfYWxsKCJcXCEiLCAiIFBVTkNfRVhDTCAiKSAlPiUNCiAgICBzdHJfcmVwbGFjZV9hbGwoIlxcPyIsICIgUFVOQ19RVUVTICIpICU+JQ0KICAgIHN0cl9yZXBsYWNlX2FsbCgiXFwuXFwuXFwuIiwgIiBQVU5DX0RPVFMgIikgJT4lDQogICAgIyMjIFdoaXRlIFNwYWNlDQogICAgc3RyX3JlcGxhY2VfYWxsKCJcXHMrIiwgIiAiKSAlPiUNCiAgICBzdHJfdHJpbSgpIA0KfQ0KDQoNCmRmIDwtIGRmICU+JQ0KICBtdXRhdGUoY3RleHQgPSBjbGVhbl9zb2NpYWxfbWVkaWEodGV4dCkpDQpgYGANCg0KDQoNCiMjIHZlY3Rvcml6ZQ0KDQpgYGB7cn0NCmxvYWQoInZlY3Rvcml6ZXIuUmRhdGEiKQ0K
IyMjIHRlc3QNCnByZWRfaXQgPC0gaXRva2VuKA0KICBkZiRjdGV4dCwgDQogIGlkcyA9IGRmJGlkLA0KICBwcm9ncmVzc2JhciA9IEYNCikNCg0KcHJlZF9kdG0gPC0gY3JlYXRlX2R0bShwcmVkX2l0LCB2ZWN0b3JpemVyKQ0KYGBgDQoNCg0KIyMgcHJlZGljdA0KDQpgYGB7cn0NCmxpYnJhcnkoaDJvKQ0KIyMjIGluaXRpYWxpemUgYW4gaDJvIGluc3RhbmNlDQpoMm8uaW5pdChudGhyZWFkcyA9IC0xKQ0KZ2JtX2Jhc2UgPC0gaDJvLmxvYWRNb2RlbCgiR0JNX21vZGVsX1JfMTUxODA0MTM5MjY3Nl8xIikNCiMgZGV2dG9vbHM6Omluc3RhbGxfZ2l0aHViKCJoMm9haS9oMm8tMy9oMm8tci9lbnNlbWJsZS9oMm9FbnNlbWJsZS1wYWNrYWdlIikNCmgyb19wcmVkIDwtIGFzLmgybyhwcmVkX2R0bSkNCnByZWQxIDwtIGgyby5wcmVkaWN0KGdibV9iYXNlLCBoMm9fcHJlZCkgJT4lDQogIGFzLmRhdGEuZnJhbWUoKQ0KDQpkZl9wcmVkIDwtIGRhdGEuZnJhbWUoZGYsIHByZWQxKQ0KDQoNCmRmX3ByZWQgPC0gZGZfcHJlZCAlPiUgDQogIGZpbHRlcihwcmVkaWN0ID09IDEpICU+JSANCiAgbXV0YXRlKHNwID0gbnRpbGUoaWQsIG4gPSAxMCkpICU+JSANCiAgYXJyYW5nZShyYW4gPSBybm9ybShuKCkpKSANCiMgDQojIGZvcihqaiBpbiAxOjEwKXsNCiMgICBzYXZlKGRmX3ByZWQgJT4lIGZpbHRlcihzcCA9PSBqaiksIGZpbGUgPSAiIikNCiMgfQ0KDQpkZl9saXN0IDwtIGRmX3ByZWQgJT4lIHNwbGl0KC4sIC5bLCJzcCJdKQ0Kc2F2ZShkZl9saXN0LCBmaWxlID0gImRmX2xpc3QuUmRhdGEiKQ0KYGBgDQoNCg0K